{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### script to tag the articles and store the articles in a collection in "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bson.objectid import ObjectId"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = MongoClient(host='act4dgem.cse.iitd.ac.in')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_client = MongoClient()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['admin', 'config', 'ikb', 'local', 'media-db']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "local_client.database_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "tagged_articles_coll = local_client.get_database('media-db').get_collection('taggedArticles')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pymongo.results.DeleteResult at 0x7f2dbc495b08>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tagged_articles_coll.delete_many({})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['test', 'taggedArticles']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "local_client.get_database('media-db').list_collection_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "mediaDb = client.get_database('media-db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "resolvedEntityDb = client.get_database('eventwise_media-db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "articles = mediaDb.get_collection('articles')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "resolved_entities = resolvedEntityDb.get_collection('all_media_entities_resolved_final')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateText(objectId, text, entype, offset, length):\n",
    "    if entype == 'Person':\n",
    "        otag = '<PER>'\n",
    "        ctag = '</PER>'\n",
    "    elif entype == 'Company' or entype == 'Organization':\n",
    "        otag = '<ORG>'\n",
    "        ctag = '</ORG>'\n",
    "    #elif entype == 'Country' or entype == 'City' or entype =='ProvinceOrState' or entype =='Continent':\n",
    "    elif entype == 'City':\n",
    "        otag = '<LOC>'\n",
    "        ctag = '</LOC>'\n",
    "\n",
    "    elif entype == 'ProvinceOrState':\n",
    "        otag = '<STA>'\n",
    "        ctag = '</STA>'\n",
    "        \n",
    "    else :\n",
    "        otag = '<MISC>'\n",
    "        ctag = '</MISC>'\n",
    "    p1 = text[:offset]\n",
    "    p2 = otag + str(objectId) + ctag\n",
    "    p3 = text[offset+length:]\n",
    "\n",
    "    shift = len(p2) - length\n",
    "    return (p1 + p2 + p3, shift)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateOffset(entities, old_offset, shift):\n",
    "    #print 'Updated Offset'\n",
    "    for e in entities:\n",
    "        for inst in e['instances']:\n",
    "            #print inst\n",
    "            if inst['offset'] > old_offset:\n",
    "                inst['offset']+=shift\n",
    "    return entities\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copyText():\n",
    "    global tagged_articles_coll\n",
    "    for a in tagged_articles_coll.find({}):\n",
    "        try:\n",
    "            tagged_articles_coll.update_one({'_id':ObjectId(a['_id'])}, {'$set':{'taggedText':a['text'], 'taggedEntities':a['entities']}})\n",
    "        except KeyError:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateEntityData(entity):\n",
    "    global articles,tagged_articles_coll\n",
    "    articleIds = entity['articleIds']\n",
    "    names = entity['aliases']\n",
    "    \n",
    "    for articleId in articleIds:\n",
    "        article = tagged_articles_coll.find({'_id':ObjectId(articleId)})\n",
    "        if article.count() == 0:\n",
    "            print(articleId, 'article not found')\n",
    "            continue\n",
    "        article = article[0]\n",
    "        text = article['taggedText']\n",
    "        try:\n",
    "            entities = article['taggedEntities']\n",
    "        except KeyError:\n",
    "            print(articleId, 'no entities present')\n",
    "            continue\n",
    "        \n",
    "        for e in entities:\n",
    "            if e['name'] in names:\n",
    "                for inst in e['instances']:\n",
    "                    old_offset = inst['offset']\n",
    "                    try:\n",
    "                        (text, shift) = updateText(entity['_id'], str(text), e['type'], inst['offset'], inst['length'])\n",
    "                    except TypeError:\n",
    "                        print(text)\n",
    "                    entities = updateOffset(entities, old_offset, shift)\n",
    "        #print(articleId)\n",
    "        tagged_articles_coll.update_one(filter={'_id':ObjectId(articleId)}, update={'$set':{'taggedEntities':entities, 'taggedText':text}})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "copyText()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "for e in resolved_entities.find(no_cursor_timeout=True):\n",
    "    try:\n",
    "        updateEntityData(e)\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('resolvedArticles.txt', 'w') as f:\n",
    "    for a in taggedArticles:\n",
    "        print(taggedArticles[a]['taggedText'], file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wc -l resolvedArticles.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "268094"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "resolved_entities.find({'graphid':{'$exists':True}}).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
